図の右上のshowボタンを押すとRのコードが表示されます。
Rでレーダーチャートを描くパッケージはいくつかあるのだが、どれもイマイチ納得いかなかったのでポーラーチャートで勘弁して下さい。
library(conflicted)
library(tidyverse)
library(patchwork)
data1 <- data.frame(
person = c("Aさん", "Bさん"),
数学 = c(5, 2),
国語 = c(5, 3),
理科 = c(5, 4),
社会 = c(4, 3),
体育 = c(4, 4),
音楽 = c(5, 2)
) |>
pivot_longer(!person)
# compelling data
data2 <- data.frame(
person = c('Aさん', 'Bさん'),
数学 = c(5, 5),
国語 = c(5, 1),
理科 = c(1, 5),
社会 = c(1, 1),
体育 = c(1, 5),
音楽 = c(5, 1)
) |>
pivot_longer(!person)
# 並び1
p1 <- data1 |>
mutate(name = factor(name, levels = c("数学", "国語", "理科", "社会", "体育", "音楽"))) |>
ggplot(aes(x = name, y = value, fill = person, group = person)) +
geom_col(position = position_dodge(width=1)) +
coord_polar() +
theme(
legend.position="none",
axis.title.x=element_blank(),
axis.title.y=element_blank()
)
# 並び2
p2 <- data1 |>
mutate(name = factor(name, levels = c("音楽", "数学", "理科", "体育", "社会", "国語"))) |>
ggplot(aes(x = name, y = value, fill = person, group = person)) +
geom_col(position = position_dodge(width=1)) +
coord_polar() +
theme(
legend.position="none",
axis.title.x=element_blank(),
axis.title.y=element_blank()
)
# 並び1
p3 <- data2 |>
mutate(name = factor(name, levels = c("数学", "国語", "理科", "社会", "体育", "音楽"))) |>
ggplot(aes(x = name, y = value, fill = person, group = person)) +
geom_col(position = position_dodge(width=1)) +
coord_polar() +
theme(
legend.position="none",
axis.title.x=element_blank(),
axis.title.y=element_blank()
)
# 並び2
p4 <- data2 |>
mutate(name = factor(name, levels = c("音楽", "数学", "理科", "体育", "社会", "国語"))) |>
ggplot(aes(x = name, y = value, fill = person, group = person)) +
geom_col(position = position_dodge(width=1)) +
coord_polar() +
theme(
legend.position="none",
axis.title.x=element_blank(),
axis.title.y=element_blank()
)
(p1 + p2) / (p3 + p4)書籍には無いが、品種ごとに分けた可視化もおまけで付けました。
library(conflicted)
library(tidyverse)
library(patchwork)
v_names <- c(
"ブドウの品種", "アルコール度数", "リンゴ酸", "ミネラル分",
"ミネラル分のアルカリ度", "マグネシウム", "全フェノール類", "フラバノイド",
"非フラバノイドフェノール類", "プロアントシアニン", "色の強さ", "色相",
"OD280/OD315値", "プロリン"
)
df <- read_csv(
"https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",
col_names = v_names,
col_types = "f"
) |>
mutate(across(where(is.numeric), \(x) scale(x))) |> #標準化
rowid_to_column() |>
pivot_longer(!c(ブドウの品種, rowid)) |>
mutate(name = factor(name, levels = v_names))
df |>
ggplot(aes(x = name, y = value, colour = ブドウの品種, group =rowid)) +
geom_line(alpha = 0.5) +
coord_flip() +
labs(title = "ワインの特徴とブドウの品種", y = "相対スコア", x = "")df |>
ggplot(aes(x = name, y = value, colour = ブドウの品種, group =rowid)) +
geom_line(alpha = 0.5) +
coord_flip() +
facet_wrap(vars(ブドウの品種)) +
labs(title = "ワインの特徴とブドウの品種", y = "相対スコア", x = "")library(conflicted)
library(tidyverse)
v_names <- c(
"ブドウの品種", "アルコール度数", "リンゴ酸", "ミネラル分",
"ミネラル分のアルカリ度", "マグネシウム", "全フェノール類", "フラバノイド",
"非フラバノイドフェノール類", "プロアントシアニン", "色の強さ", "色相",
"OD280/OD315値", "プロリン"
)
df <- read_csv(
"https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",
col_names = v_names,
col_types = "f"
) |>
mutate(across(where(is.numeric), \(x) scale(x))) |> #標準化
rowid_to_column() |>
pivot_longer(!c(ブドウの品種, rowid)) |>
mutate(name = factor(name, levels = v_names))
p1 <- df |>
ggplot(aes(x = name, y = value, group = ブドウの品種, fill = ブドウの品種)) +
stat_summary(geom = "bar", fun = "mean", position = "dodge2") +
stat_summary(geom = "errorbar", fun.data = "mean_se", position = "dodge2") +
labs(y = "相対スコア", x = "") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
p2 <- df |>
ggplot(aes(x = name, y = value, fill = ブドウの品種)) +
geom_violin(position="dodge") +
labs(y = "相対スコア", x = "") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
p1 / p2library(conflicted)
library(tidyverse)
v_names <- c(
"ブドウの品種", "アルコール度数", "リンゴ酸", "ミネラル分",
"ミネラル分のアルカリ度", "マグネシウム", "全フェノール類", "フラバノイド",
"非フラバノイドフェノール類", "プロアントシアニン", "色の強さ", "色相",
"OD280/OD315値", "プロリン"
)
df <- read_csv(
"https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",
col_names = v_names,
col_types = "f"
) |>
mutate(across(where(is.numeric), \(x) scale(x))) |> #標準化
rowid_to_column() |>
pivot_longer(!c(rowid, ブドウの品種)) |>
mutate(name = factor(name, levels = rev(v_names)))
jet.colors <- colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan", "#7FFF7F", "yellow", "#FF7F00", "red", "#7F0000"))
df |>
ggplot(aes(x = rowid, y = name, fill = value)) +
geom_tile() +
facet_wrap(vars(ブドウの品種), scales = "free_x") +
scale_fill_gradientn(colours = jet.colors(100)) +
labs(x = "ワイン銘柄番号", y = "", title = "カラーコードで値を表現する")library(conflicted)
library(tidyverse)
df <- read_csv(
"https://raw.githubusercontent.com/tkEzaki/data_visualization/main/4%E7%AB%A0/data/behavior_data.csv"
) |>
mutate(Time = Time / 3600) |> # Time列を秒から時間に変換
pivot_longer(!Time) |>
mutate(
value = case_when(
value == "Garbage" ~ "ゴミ捨て場",
value == "Nest" ~ "寝室",
value == "Other" ~ "一般の部屋",
value == "Toilet" ~ "トイレ"
) |>
factor(levels = c("ゴミ捨て場", "トイレ", "寝室", "一般の部屋"))
)
df |>
ggplot(aes(x =name , y = Time, fill = value)) +
geom_tile() +
scale_y_reverse() +
scale_fill_brewer(palette = "Set1") +
labs(title = "ヒートマップによる行動時系列の可視化", x = "個体", y = "時間[h]")library(conflicted)
library(tidyverse)
library(patchwork)
# CSVデータを読み込む
df <- read_csv(
"https://raw.githubusercontent.com/tkEzaki/data_visualization/main/4%E7%AB%A0/data/behavior_data.csv"
) |>
mutate(Time = Time / 3600) |> # Time列を秒から時間に変換
pivot_longer(!Time) |>
mutate(
value = case_when(
value == "Garbage" ~ "ゴミ捨て場",
value == "Nest" ~ "寝室",
value == "Other" ~ "一般の部屋",
value == "Toilet" ~ "トイレ"
) |>
factor(levels = rev(c("ゴミ捨て場", "トイレ", "寝室", "一般の部屋")))
)
# 1個体(J)のデータ
df_j <- df |>
dplyr::filter(name == "J") |>
dplyr::select(!name) |>
mutate(action = "1")
p1 <- df_j |>
ggplot(aes(x = value, y = Time, fill = value)) +
geom_tile() +
scale_y_reverse() +
labs(x = "", y = "時間[h]") +
scale_fill_brewer(palette = "Set1") +
theme(
legend.position="none",
aspect.ratio = 4 / 1.5
)
p2 <- df_j |>
dplyr::filter(Time >= 13, Time <= 16) |>
ggplot(aes(x = value, y = Time, fill = value)) +
geom_tile() +
scale_y_reverse() +
labs(x = "", y = "時間[h]") +
scale_fill_brewer(palette = "Set1") +
theme(
legend.position="none",
aspect.ratio = 4 / 1.5
)
p1 + p2library(conflicted)
library(tidyverse)
library(patchwork)
# 共著データ
researchers <- c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")
collaboration <- matrix(
c(
NA, 1, 1, 0, 0, 0, 0, 0, 0, 1,
1, NA, 0, 1, 1, 1, 0, 0, 0, 1,
1, 0, NA, 0, 0, 0, 1, 1, 1, 0,
0, 1, 0, NA, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, NA, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, NA, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, NA, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, NA, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, NA, 0,
1, 1, 0, 0, 0, 0, 0, 0, 0, NA
),
nrow=10,
dimnames = list(researchers, researchers)
)
p1 <- collaboration |>
as.data.frame() |>
rownames_to_column() |>
pivot_longer(!rowname) |>
mutate(
value = factor(value),
name = factor(name, levels = rev(LETTERS[1:10]))
) |>
ggplot(aes(x = rowname, y = name, fill = value, label = value)) +
geom_tile() +
geom_text() +
scale_fill_hue(name = "", labels = c("0" = "共著なし", "1" ="共著あり")) +
labs(title = "共著関係の有無", x = "", y = "") +
theme(aspect.ratio = 1)
# ここはもうちょっとスマートに書けないか?
set.seed(0)
v0 <- runif(n = (100 -10)/2, min = 0.0, max = 0.5) |> round(2)
v1 <- runif(n = (100 -10)/2, min = 0.5, max = 1.0) |> round(2)
collaboration_score <- collaboration
collaboration_score[upper.tri(collaboration_score)] <-
if_else(collaboration[lower.tri(collaboration)]== 1, v1, v0)
collaboration_score <- t(collaboration_score)
collaboration_score[upper.tri(collaboration_score)] <-
if_else(collaboration[lower.tri(collaboration)]== 1, v1, v0)
jet.colors <- colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan", "#7FFF7F", "yellow", "#FF7F00", "red", "#7F0000"))
p2 <- collaboration_score |>
as.data.frame() |>
rownames_to_column() |>
pivot_longer(!rowname) |>
mutate(name = factor(name, levels = rev(LETTERS[1:10]))) |>
ggplot(aes(x = rowname, y = name, fill = value, label = value)) +
geom_tile() +
geom_text(size = 2) +
labs(title = "研究の興味の類似度", x = "", y = "") +
theme(aspect.ratio = 1) +
scale_fill_gradientn(colours = jet.colors(100))
p1 + p2library(conflicted)
library(tidyverse)
library(igraph)
# 共著データ
researchers <- c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")
collaboration <- matrix(
c(
NA, 1, 1, 0, 0, 0, 0, 0, 0, 1,
1, NA, 0, 1, 1, 1, 0, 0, 0, 1,
1, 0, NA, 0, 0, 0, 1, 1, 1, 0,
0, 1, 0, NA, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, NA, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, NA, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, NA, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, NA, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, NA, 0,
1, 1, 0, 0, 0, 0, 0, 0, 0, NA
),
nrow=10,
dimnames = list(researchers, researchers)
)
# 研究の興味の類似度
set.seed(0)
v0 <- runif(n = (100 -10)/2, min = 0.0, max = 0.5) |> round(2)
v1 <- runif(n = (100 -10)/2, min = 0.5, max = 1.0) |> round(2)
collaboration_score <- collaboration
collaboration_score[upper.tri(collaboration_score)] <-
if_else(collaboration[lower.tri(collaboration)]== 1, v1, v0)
collaboration_score <-
t(collaboration_score)
collaboration_score[upper.tri(collaboration_score)] <-
if_else(collaboration[lower.tri(collaboration)]== 1, v1, v0)
# 共著ネットワーク
diag(collaboration) <- 0
collabo_g <- collaboration |>
graph_from_adjacency_matrix(mode = "undirected")
set.seed(0)
par(mfrow = c(2, 2))
collabo_g |>
plot(layout = layout_in_circle)
collabo_g |>
plot()
# 類似度ネットワーク
diag(collaboration_score) <- 0
score_g <- collaboration_score |>
graph_from_adjacency_matrix(mode = "undirected", weighted = TRUE)
jet.colors <- colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan", "#7FFF7F", "yellow", "#FF7F00", "red", "#7F0000"))
col_p <- round((E(score_g)$weight / max(E(score_g)$weight)) * 100, 0)
set.seed(0)
collabo_g |>
plot(
layout = layout_in_circle,
edge.width = E(score_g)$weight * 10,
edge.color = jet.colors(100)[col_p]
)
score_g |>
plot(
edge.width = E(score_g)$weight * 10,
edge.color = jet.colors(100)[col_p]
)library(conflicted)
library(tidyverse)
# 研究者リスト
researchers <- c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")
# 共著データ
collaboration <- matrix(
c(
NA, 1, 1, 0, 0, 0, 0, 0, 0, 1,
0, NA, 0, 1, 1, 1, 0, 0, 0, 0,
0, 0, NA, 0, 0, 0, 1, 1, 1, 0,
0, 0, 0, NA, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, NA, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, NA, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, NA, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, NA, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, NA, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, NA
),
nrow = 10,
dimnames = list(researchers, researchers)
)
# 共著ネットワーク
collaboration |>
as.data.frame() |>
rownames_to_column() |>
pivot_longer(!rowname) |>
mutate(
value = factor(value),
name = factor(name, levels = rev(LETTERS[1:10]))
) |>
ggplot(aes(x = rowname, y = name, fill = value, label = value)) +
geom_tile() +
geom_text() +
scale_fill_hue(name = "", labels = c("0" = "指導関係なし", "1" ="指導関係あり")) +
labs(title = "隣接行列表示", x = "指導された研究者", y = "指導した研究者") +
theme(aspect.ratio = 1)library(conflicted)
library(tidyverse)
library(igraph)
# 研究者リスト
researchers <- c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")
# 共著データ
collaboration <- matrix(
c(
NA, 1, 1, 0, 0, 0, 0, 0, 0, 1,
0, NA, 0, 1, 1, 1, 0, 0, 0, 0,
0, 0, NA, 0, 0, 0, 1, 1, 1, 0,
0, 0, 0, NA, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, NA, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, NA, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, NA, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, NA, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, NA, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, NA
),
nrow = 10,
dimnames = list(researchers, researchers)
)
diag(collaboration) <- 0
collaboration |>
t() |>
graph_from_adjacency_matrix(mode = "directed") |>
plot(layout = layout_as_tree)circoレイアウトがigraphには無いようなのでMDSレイアウトで代用。
library(conflicted)
library(tidyverse)
library(igraph)
set.seed(0)
G_er <- erdos.renyi.game(30, 0.2)
G_ws <- watts.strogatz.game(1, 30, 5, 0.1)
G_ba <- barabasi.game(30, 1)
ErdosR <- as_adjacency_matrix(G_er, sparse = FALSE)
WattsStrogatz <- as_adjacency_matrix(G_ws, sparse = FALSE)
BarabasiAlbert <- as_adjacency_matrix(G_ba, sparse = FALSE)
net1 <- graph_from_adjacency_matrix(ErdosR, mode = "max")
net2 <- graph_from_adjacency_matrix(WattsStrogatz, mode = "max")
net3 <- graph_from_adjacency_matrix(BarabasiAlbert, mode = "max")
par(mfrow = c(3,3))
plot(net1, layout = layout_in_circle,
main = "Erdos-Reyni: Circular Layout", vertex.size = 9, vertex.label=NA, )
plot(net2, layout = layout_in_circle,
main = "Watts-Strogatz: Circular Layout", vertex.size = 9, vertex.label=NA)
plot(net3, layout = layout_in_circle,
main = "Barabasi-Albert: Circular Layout", vertex.size = 9, vertex.label=NA)
plot(net1, layout = layout_with_mds,
main = "Erdos-Reyni: MDS Layout", vertex.size = 9, vertex.label=NA)
plot(net2, layout = layout_with_mds,
main = "Watts-Strogatz: MDS Layout", vertex.size = 9, vertex.label=NA)
plot(net3, layout = layout_with_mds,
main = "Barabasi-Albert: MDS Layout", vertex.size = 9, vertex.label=NA)
plot(net1, layout = layout_with_kk(net1),
main = "Erdos-Reyni: KK Layout", vertex.size = 9, vertex.label=NA)
plot(net2, layout = layout_with_kk(net2),
main = "Watts-Strogatz: KK Layout", vertex.size = 9, vertex.label=NA)
plot(net3, layout = layout_with_kk(net3),
main = "Barabasi-Albert: KK Layout", vertex.size = 9, vertex.label=NA)dotとcircoが無いのでtree, mdsで代用
library(conflicted)
library(tidyverse)
library(igraph)
# ランダム有向グラフを生成
set.seed(0)
G_dir_random <- sample_gnm(30, 81, directed = TRUE)
# 階層構造を持つ有向グラフを生成
set.seed(0)
G_dir_hierarchy <- sample_tree(30, directed = TRUE)
my_plot <- function(data, layout){
plot(
data, layout = layout,
vertex.color="yellow",
vertex.label = NA,
edge.arrow.size = 0.5
)
}
par(mfrow = c(3,2))
my_plot(G_dir_random, layout_as_tree)
my_plot(G_dir_hierarchy, layout_as_tree)
my_plot(G_dir_random, layout_with_mds)
my_plot(G_dir_hierarchy, layout_with_mds)
my_plot(G_dir_random, layout_with_kk)
my_plot(G_dir_hierarchy, layout_with_kk)library(conflicted)
library(tidyverse)
library(pheatmap)
df <- read_csv(
"https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",
col_names = c(
"ブドウの品種", "アルコール度数", "リンゴ酸", "ミネラル分",
"ミネラル分のアルカリ度", "マグネシウム", "全フェノール類", "フラバノイド",
"非フラバノイドフェノール類", "プロアントシアニン", "色の強さ", "色相",
"OD280/OD315値", "プロリン"
),
col_types = "f"
) |>
mutate(across(where(is.numeric), \(x) scale(x))) #標準化
jet.colors <- colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan", "#7FFF7F", "yellow", "#FF7F00", "red", "#7F0000"))
pheatmap(
df |>
dplyr::select(!ブドウの品種) |>
t(),
color = jet.colors(50),
clustering_method = "ward.D2"
)BIRCHだけ見つからないので省略。
ちなみにRではmlbenchパッケージにベンチマーク用の様々なデータセットと人工データ生成用の関数が用意されています。
library(conflicted)
library(tidyverse)
library(ClusterR) # MiniBatch KMeans
library(apcluster) # Affinity Propagation
library(meanShiftR) # MeanShift
library(skmeans) # Spectral Clustering
library(cluster) # Agglomerative Clustering
library(dbscan) # DBSCAN, HDBSCAN, OPTICS,
library(mclust) # Gaussian Mixture
library(patchwork)
# データ用意
my_read_csv <- function(file){
read_csv(file, col_names = c("x", "y", "class")) |>
mutate( #標準化しておく
x = (x - mean(x))/sd(x),
y = (y - mean(x))/sd(x),
class = factor(class + 1))
}
# 円形のクラスタ
noisy_circles <- my_read_csv(
"https://raw.githubusercontent.com/morimotoosamu/data_visualization/main/data/noisy_circles.csv"
)
# 月型のクラスタ
noisy_moons <- my_read_csv(
"https://raw.githubusercontent.com/morimotoosamu/data_visualization/main/data/noisy_moons.csv"
)
# 正規分布に従うクラスタ
blobs <- my_read_csv(
"https://raw.githubusercontent.com/morimotoosamu/data_visualization/main/data/blobs.csv"
)
# 異方性のあるデータ
aniso <- my_read_csv(
"https://raw.githubusercontent.com/morimotoosamu/data_visualization/main/data/aniso.csv"
)
# 3つの正規分布に従うデータ
varied <- my_read_csv(
"https://raw.githubusercontent.com/morimotoosamu/data_visualization/main/data/varied.csv"
)
n <- 500
set.seed(0)
no_structure <- data.frame(x = runif(n), y = runif(n), class = factor("0"))# 構造のないデータ
# 各手法のラッパー関数を用意
# MiniBatch KMeans
my_MiniBatchKmeans <- function(data, cluster_num) {
data_cleaned <- data[, 1:2]
start <- Sys.time()
fit <- MiniBatchKmeans(data_cleaned, clusters = cluster_num)
pred <- predict(fit, data_cleaned)
end <- Sys.time()
diff <- end - start
data$pred <- as.factor(pred)
return(list(res = data, diff = diff))
}
# Affinity Propagation
my_apcluster <- function(data, cluster_num) {
data_cleaned <- data[, 1:2]
start <- Sys.time()
pred <- apcluster(s = negDistMat(r=2), x = data_cleaned, p = -200, q = 0.9) |>
cutree(cluster_num) |>
labels(type = "enum")
end <- Sys.time()
diff <- end - start
data$pred <- as.factor(pred)
return(list(res = data, diff = diff))
}
# MeanShift
my_meanShift <- function(data) {
data_cleaned <- as.matrix(data[, 1:2])
start <- Sys.time()
pred <- meanShift(data_cleaned, data_cleaned)$assignment
end <- Sys.time()
diff <- end - start
data$pred <- as.factor(pred)
return(list(res = data, diff = diff))
}
# Spectral Clustering
my_skmeans <- function(data, cluster_num) {
data_cleaned <- base::as.matrix(data[, 1:2])
start <- Sys.time()
pred <- skmeans(data_cleaned, k = cluster_num)$cluster
end <- Sys.time()
diff <- end - start
data$pred <- as.factor(pred)
return(list(res = data, diff = diff))
}
# Ward
my_hclust <- function(data, cluster_num) {
data_cleaned <- data[, 1:2]
start <- Sys.time()
pred <- data_cleaned |>
stats::dist() |>
hclust(method = "ward.D2") |>
cutree(k = cluster_num)
end <- Sys.time()
diff <- end - start
data$pred <- as.factor(pred)
return(list(res = data, diff = diff))
}
# Agglomerative Clustering
my_agnes <- function(data, cluster_num) {
data_cleaned <- data[, 1:2]
start <- Sys.time()
pred <- agnes(data_cleaned) |> cutree(k = cluster_num)
end <- Sys.time()
diff <- end - start
data$pred <- as.factor(pred)
return(list(res = data, diff = diff))
}
# DBSCAN
my_dbscan <- function(data) {
data_cleaned <- data[, 1:2]
start <- Sys.time()
pred <- dbscan::dbscan(data_cleaned, eps = 0.3)$cluster
end <- Sys.time()
diff <- end - start
data$pred <- as.factor(pred)
return(list(res = data, diff = diff))
}
# HDBSCAN
my_hdbscan <- function(data) {
data_cleaned <- data[, 1:2]
start <- Sys.time()
pred <- dbscan::hdbscan(data_cleaned, minPts = 15)$cluster
end <- Sys.time()
diff <- end - start
data$pred <- as.factor(pred)
return(list(res = data, diff = diff))
}
# OPTICS
my_optics <- function(data) {
data_cleaned <- data[, 1:2]
start <- Sys.time()
pred <- dbscan::optics(data_cleaned, eps = 0.1, minPts = 7) |>
extractXi(xi = 0.05) |>
pluck("cluster")
end <- Sys.time()
diff <- end - start
data$pred <- as.factor(pred)
return(list(res = data, diff = diff))
}
# Gaussian Mixture
my_Mclust <- function(data) {
data_cleaned <- data[, 1:2]
start <- Sys.time()
pred <- mclust::Mclust(data_cleaned, G = 1:3)$classification
end <- Sys.time()
diff <- end - start
data$pred <- as.factor(pred)
return(list(res = data, diff = diff))
}
### クラスタリング実行
dats <- list(noisy_circles, noisy_moons, varied, aniso, blobs, no_structure)
cnums <- list(2,2,3,3,3,3)
set.seed(0)
res_clustering <- list(
res_MiniBatchKmeans = map2(dats, cnums, \(dats, cnums) my_MiniBatchKmeans(dats, cnums)),
res_apcluster = map2(dats, cnums, \(dats, cnums) my_apcluster(dats, cnums)),
res_meanShift = purrr::map(dats, \(dats) my_meanShift(dats)),
res_skmeans = map2(dats, cnums, \(dats, cnums) my_skmeans(dats, cnums)),
res_hclust = map2(dats, cnums, \(dats, cnums) my_hclust(dats, cnums)),
res_agnes = map2(dats, cnums, \(dats, cnums) my_agnes(dats, cnums)),
res_dbscan = purrr::map(dats, \(dats) my_dbscan(dats)),
res_hdbscan = purrr::map(dats, \(dats) my_hdbscan(dats)),
res_optics = purrr::map(dats, \(dats) my_optics(dats)),
res_Mclust = purrr::map(dats, \(dats) my_Mclust(dats))
)
# 描画用ラッパー関数を用意
my_plot <- function(result) {
result$res |>
ggplot(aes(x = x, y = y, color = pred)) +
geom_point(size = 1) +
labs(caption = paste0(round(result$diff,3),"s")) +
theme(
axis.ticks = element_blank(), # tickの線を消す
axis.text = element_blank(), # tickの数字を消す
axis.title = element_blank(), # 軸のラベルを消す
axis.line = element_blank(), # 軸の線を消す
legend.position="none",
aspect.ratio = 1
)
}
nums <- expand_grid(d = 1:6, m = 1:10)
res_plot <- map2(nums$m, nums$d, \(.x, .y) my_plot(res_clustering[[.x]][[.y]]))
# ここがダサい……もうちょっとどうにかならないか
res_plot[[1]] + res_plot[[2]] +res_plot[[3]] +res_plot[[4]] +res_plot[[5]] +res_plot[[6]] +
res_plot[[7]] + res_plot[[8]] +res_plot[[9]] +res_plot[[10]] +res_plot[[11]] +res_plot[[12]] +
res_plot[[13]] + res_plot[[14]] +res_plot[[15]] +res_plot[[16]] +res_plot[[17]] +res_plot[[18]] +
res_plot[[19]] + res_plot[[20]] +res_plot[[21]] +res_plot[[22]] +res_plot[[23]] +res_plot[[24]] +
res_plot[[25]] + res_plot[[26]] +res_plot[[27]] +res_plot[[28]] +res_plot[[29]] +res_plot[[30]] +
res_plot[[31]] + res_plot[[32]] +res_plot[[33]] +res_plot[[34]] +res_plot[[35]] +res_plot[[36]] +
res_plot[[37]] + res_plot[[38]] +res_plot[[39]] +res_plot[[40]] +res_plot[[41]] +res_plot[[42]] +
res_plot[[43]] + res_plot[[44]] +res_plot[[45]] +res_plot[[46]] +res_plot[[47]] +res_plot[[48]] +
res_plot[[49]] + res_plot[[50]] +res_plot[[51]] +res_plot[[52]] +res_plot[[53]] +res_plot[[54]] +
res_plot[[55]] + res_plot[[56]] +res_plot[[57]] +res_plot[[58]] +res_plot[[59]] +res_plot[[60]] +
plot_layout(ncol = 10)library(conflicted)
library(tidyverse)
library(GGally)
df <- read_csv(
"https://raw.githubusercontent.com/morimotoosamu/data_visualization/main/data/df_434.csv"
)
df |>
ggpairs()library(conflicted)
library(tidyverse)
library(skmeans)
library(GGally)
library(patchwork)
df <- read_csv(
"https://raw.githubusercontent.com/morimotoosamu/data_visualization/main/data/df_434.csv"
)
# 主成分分析(標準化)
pca_result <- prcomp(df)#, scale. = TRUE)
pca_importance <- as.data.frame(summary(pca_result)$importance[2,]) |>
rownames_to_column()
names(pca_importance) <- c("pc", "value")
# Spectral Clustering
clusters_pca <- skmeans(pca_result$x[, 1:2], 3)
p1 <- data.frame(
pc1 = pca_result$x[, 1],
pc2 = pca_result$x[, 2],
class = factor(clusters_pca$cluster)
) |>
ggplot(aes(x = pc1, y = pc2, color = class)) +
geom_point() +
labs(title = "二つの主成分で見る")+
theme(legend.position="none", aspect.ratio = 1)
p2 <- pca_importance |>
ggplot(aes(x = reorder(pc, desc(value)), y = value)) +
geom_col()+
labs(title = "各主成分のデータ悦明力", x = "", y = "")+
theme(aspect.ratio = 1)
p1 + p2library(conflicted)
library(tidyverse)
library(MASS) # MDS(sammon)
library(Rtsne) # t-SNE
library(umap) # UMAP
df <- read_csv(
"https://raw.githubusercontent.com/morimotoosamu/data_visualization/main/data/digits.csv"
)
df_cleaned <- df |>
dplyr::select(!target)
# 各手法で2次元に圧縮
# 主成分分析
X_pca <- prcomp(df_cleaned)$x[, 1:2]
# t-SNE
set.seed(42)
X_tsne <- Rtsne(df_cleaned, num_threads = 2)$Y
# MDS
X_mds <- df_cleaned |>
dist() |>
sammon(trace = FALSE) |>
pluck("points")
# UMAP
set.seed(42)
X_umap <- umap(df_cleaned)$layout
# K-means
set.seed(42)
clusters_pca <- kmeans(X_pca, 10)$cluster # PCA
set.seed(42)
clusters_mds <- kmeans(X_mds, 10)$cluster #MDS
set.seed(42)
clusters_tsne <- kmeans(X_tsne, 10)$cluster # t-SNE
set.seed(42)
clusters_umap <- kmeans(X_umap, 10)$cluster # UMAP
# 結果をデータフレームにまとめる
df_base <- data.frame(
x = c(X_pca[, 1], X_mds[, 1], X_tsne[, 1], X_umap[, 1]),
y = c(X_pca[, 2], X_mds[, 2], X_tsne[, 2], X_umap[, 2])
)
n <- 1797
dimred <- bind_rows(
df_base |>
mutate(
label = rep(df$target, 4),
method = c(rep("pca_label", n), rep("mds_label", n),rep("tsne_label", n),rep("umap_label", n))
),
df_base |>
mutate(
label = c(clusters_pca, clusters_mds, clusters_tsne, clusters_umap),
method = c(rep("pca_kmeans", n), rep("mds_kmeans", n),rep("tsne_kmeans", n),rep("umap_kmeans", n))
)
) |>
mutate(
method = factor(
method,
levels = c("pca_kmeans", "pca_label", "mds_kmeans", "mds_label",
"tsne_kmeans", "tsne_label", "umap_kmeans", "umap_label")),
label = factor(label)
)
# クラスタリング結果と正解ラベルの描画
dimred |>
ggplot(aes(x = x, y = y, color = label)) +
geom_point() +
labs(title="様々な次元圧縮方法") +
theme(
axis.ticks = element_blank(), # tickの線を消す
axis.text = element_blank(), # tickの数字を消す
axis.title = element_blank(), # 軸のラベルを消す
axis.line = element_blank(), # 軸の線を消す
legend.position="none",
aspect.ratio = 1
) +
facet_wrap(vars(method), nrow = 2, scales = "free")第4章はここまで。